"""Scrape infzm.com articles: for each news ID in id.txt, fetch the page,
extract the <title> text and the publish time, and save all records to a CSV.
"""
from bs4 import BeautifulSoup
import urllib.request
import time
import re
import pdfkit  # NOTE(review): unused in this chunk — presumably needed elsewhere
import pandas


headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

# Publish-time pattern, compiled once (raw string, hoisted out of the loop).
TIME_PATTERN = re.compile(r'data-time="(\d+-\d+-\d+ \d+:\d+:\d+)">')

# Load the news IDs; `with` closes the file even if reading fails.
with open('D://nanfang/20200403/id.txt', 'r', encoding='utf-8') as newsid:
    idlist = newsid.read().split('\n')

title = []
up_time = []
url = []
try:
    for i in idlist:
        # Skip blank lines (e.g. the trailing newline in id.txt) so we never
        # request 'contents/' with an empty ID.
        if not i.strip():
            continue

        # Fetch the article page with a browser-like User-Agent.
        newsurl = 'http://www.infzm.com/contents/' + i
        req = urllib.request.Request(url=newsurl, headers=headers)
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        url.append(newsurl)

        soup = BeautifulSoup(data, 'lxml')
        title.append(soup.select('title')[0].get_text())
        print(len(title))

        # Extract the publish time via search/group instead of stringifying
        # findall() and stripping "['...']" by hand. The result is bound to
        # `news_time`, NOT `time` — the original rebinding shadowed the time
        # module and would have broken any later time.sleep() call.
        match = TIME_PATTERN.search(data)
        news_time = match.group(1) if match else ''
        up_time.append(news_time)
        print(len(up_time))

        # Rate-limit between requests. The original slept once at import
        # time, which throttled nothing.
        time.sleep(2)

    news = {'title': title, 'url': url, 'time': up_time}
    all_data = pandas.DataFrame(news)
    all_data.to_csv('D://nanfang/20200403/nanfang.csv', encoding='utf-8-sig')
    print(all_data)
except Exception as err:
    # Top-level boundary: report the failure rather than crash mid-scrape.
    print(err)
